# Purpose:
# Replicate the post hoc standardized residual analysis for the disfluency-by-cognitive-process
# contingency table, using participant-stratified permutation to derive empirical p-values.

library(readr)
library(dplyr)

input_file <- "data/change to data file name"
residuals_file <- "outputs/03_standardized_residuals.csv"
pvalues_file <- "outputs/03_empirical_pvalues.csv"

df <- read_csv(input_file, show_col_types = FALSE)

required_cols <- c("participant_id", "tc", "disf_loc")
missing_cols <- setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

df <- df %>%
  mutate(
    participant_id = as.factor(participant_id),
    tc = as.factor(tc),
    disf_loc = as.factor(disf_loc)
  )

obs_tab <- table(df$disf_loc, df$tc)
expected_tab <- outer(rowSums(obs_tab), colSums(obs_tab)) / sum(obs_tab)
expected_tab_safe <- ifelse(expected_tab == 0, 1e-10, expected_tab)
obs_resid <- (obs_tab - expected_tab) / sqrt(expected_tab_safe)

n_perm <- 1000
set.seed(123)

rnames <- rownames(obs_tab)
cnames <- colnames(obs_tab)
resid_array <- array(NA_real_, dim = c(length(rnames), length(cnames), n_perm))

for (i in seq_len(n_perm)) {
  perm_df <- df %>%
    group_by(participant_id) %>%
    mutate(tc_perm = sample(tc)) %>%
    ungroup()

  perm_tab <- table(perm_df$disf_loc, perm_df$tc_perm)
  perm_exp <- outer(rowSums(perm_tab), colSums(perm_tab)) / sum(perm_tab)
  perm_exp_safe <- ifelse(perm_exp == 0, 1e-10, perm_exp)
  perm_resid <- (perm_tab - perm_exp) / sqrt(perm_exp_safe)

  resid_array[, , i] <- perm_resid
}

emp_p <- matrix(
  NA_real_,
  nrow = length(rnames),
  ncol = length(cnames),
  dimnames = list(rnames, cnames)
)

for (i in seq_along(rnames)) {
  for (j in seq_along(cnames)) {
    obs_val <- obs_resid[i, j]
    null_vals <- resid_array[i, j, ]
    emp_p[i, j] <- mean(abs(null_vals) >= abs(obs_val))
  }
}

write.csv(as.data.frame.matrix(round(obs_resid, 4)), residuals_file, row.names = TRUE)
write.csv(as.data.frame.matrix(round(emp_p, 4)), pvalues_file, row.names = TRUE)
